{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import xgboost as xgb\n",
    "import pickle\n",
    "import os\n",
    "import sys,random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "if not os.path.exists('featurescore'):\n",
    "    os.mkdir('featurescore')\n",
    "if not os.path.exists('model'):\n",
    "    os.mkdir('model')\n",
    "if not os.path.exists('preds'):\n",
    "    os.mkdir('preds')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 40.6 s, sys: 2.85 s, total: 43.4 s\n",
      "Wall time: 42 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "#load data\n",
    "# 使用rank特征需要归一化\n",
    "train_x = pd.read_csv(\"../../preprocess_data/train_x_float_rank.csv\")\n",
    "train_y = pd.read_csv(\"../../preprocess_data/train_y_33465.csv\")\n",
    "\n",
    "train_x.drop([\"id\"], axis=1, inplace=True)\n",
    "train_x = train_x/len(train_x)\n",
    "dtrain = xgb.DMatrix(train_x, label=train_y)\n",
    "    \n",
    "valid = pd.read_csv(\"../../preprocess_data/valid_float_rank.csv\")\n",
    "valid_id = valid.id\n",
    "valid.drop(\"id\",axis=1,inplace=True)\n",
    "valid = valid/len(valid)\n",
    "dvalid = xgb.DMatrix(valid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pipeline(dtrain,dtest,test_id,iteration,random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight):\n",
    "    params={\n",
    "        'booster':'gbtree',\n",
    "        'objective': 'binary:logistic',\n",
    "#         'scale_pos_weight': float(len(y)-sum(y))/float(sum(y)), # sum(negative instances) / sum(positive instances)， for unbalanced samples\n",
    "        'eval_metric': 'auc',\n",
    "        'gamma':gamma,\n",
    "        'max_depth':max_depth,\n",
    "        'lambda':lambd,\n",
    "        'subsample':subsample,\n",
    "        'colsample_bytree':colsample_bytree,\n",
    "        'min_child_weight':min_child_weight, \n",
    "        'eta': 0.04,\n",
    "        'seed':random_seed,\n",
    "        'nthread':-1\n",
    "        }\n",
    "    \n",
    "    watchlist  = [(dtrain,'train')]\n",
    "    model = xgb.train(params,dtrain,num_boost_round=1350,evals=watchlist)\n",
    "    model.save_model('./model/xgb{0}.model'.format(iteration))\n",
    "    \n",
    "    #predict test set\n",
    "    test_y = model.predict(dtest)\n",
    "    test_result = pd.DataFrame(columns=[\"id\",\"score\"])\n",
    "    test_result.id = test_id\n",
    "    test_result.score = test_y\n",
    "    test_result.to_csv(\"./preds/xgb{0}.csv\".format(iteration),index=None,encoding='utf-8')\n",
    "    \n",
    "    #save feature score\n",
    "    feature_score = model.get_fscore()\n",
    "    feature_score = sorted(feature_score.items(), key=lambda x:x[1],reverse=True)\n",
    "    fs = []\n",
    "    for (key,value) in feature_score:\n",
    "        fs.append(\"{0},{1}\\n\".format(key,value))\n",
    "    \n",
    "    with open('./featurescore/feature_score_{0}.csv'.format(iteration),'w') as f:\n",
    "        f.writelines(\"feature,score\\n\")\n",
    "        f.writelines(fs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "random_seed = [x for x in range(1000,2000,10)]\n",
    "gamma = [i/1000.0 for i in range(100,200,1)]\n",
    "max_depth = [6,7,8]\n",
    "lambd = [x for x in range(100,200,1)]\n",
    "subsample = [i/1000.0 for i in range(500,700,2)]\n",
    "colsample_bytree = [i/1000.0 for i in range(250,350,1)]\n",
    "min_child_weight = [i/1000.0 for i in range(200,300,1)]\n",
    "random.shuffle(random_seed)\n",
    "random.shuffle(gamma)\n",
    "random.shuffle(max_depth)\n",
    "random.shuffle(lambd)\n",
    "random.shuffle(subsample)\n",
    "random.shuffle(colsample_bytree)\n",
    "random.shuffle(min_child_weight)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "#save params for reproducing\n",
    "with open('params.pkl','wb') as f:\n",
    "    pickle.dump((random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight),f)\n",
    "\n",
    "#to reproduce my result, uncomment following lines\n",
    "\"\"\"\n",
    "with open('params_for_reproducing.pkl','r') as f:\n",
    "    random_seed,gamma,max_depth,lambd,subsample,colsample_bytree,min_child_weight = pickle.load(f)    \n",
    "\"\"\"\n",
    "#train 100 xgb\n",
    "\n",
    "for i in range(100):\n",
    "    pipeline(dtrain,dvalid,valid_id,i,random_seed[i],gamma[i],max_depth[i%3],lambd[i],subsample[i],colsample_bytree[i],min_child_weight[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
